The following notebook contains some of our experiments on the context encoder presented by Deepak Pathak et al. The paper, as well as pre-trained networks and information on how to use them, can be found on the context encoder GitHub page: https://people.eecs.berkeley.edu/~pathak/context_encoder/
require 'image'
require 'nn'
-- default all tensors to float: the pre-trained inpainting models are float networks
torch.setdefaulttensortype('torch.FloatTensor')
First let us load the pre-trained networks from the paper. The following cell requires the models to have been downloaded and extracted from the author webpage (http://www.cs.berkeley.edu/~pathak/context_encoder/resources/inpaintCenterModels.tar.gz) by running the following shell command from inside the context-encoder folder : bash ./models/scripts/download_inpaintCenter_models.sh
--load pre-trained network trained on paris_street_view images
netParis = torch.load('./context-encoder/models/inpaintCenter/paris_inpaintCenter.t7')
-- allocate zeroed gradient buffers for every parameterized module; the
-- serialized nets ship without them and some modules expect them to exist
netParis:apply(function(m) if m.weight then
m.gradWeight = m.weight:clone():zero();
m.gradBias = m.bias:clone():zero(); end end)
-- evaluate() switches BatchNorm/Dropout-style modules to inference behaviour
netParis:evaluate()
netParis:float()
--load pre-trained network trained on imagenet images
netImagenet = torch.load('./context-encoder/models/inpaintCenter/imagenet_inpaintCenter.t7')
netImagenet:apply(function(m) if m.weight then
m.gradWeight = m.weight:clone():zero();
m.gradBias = m.bias:clone():zero(); end end)
netImagenet:evaluate()
netImagenet:float()
-- print the module graph to inspect the encoder/decoder architecture
print(netParis)
As we can see, the context encoder is composed of two sub-networks: an encoder with 5 hidden layers (corresponding to the first, second and third modules of the torch nn.Sequential object) and a decoder with 4 hidden layers (corresponding to the 14 other modules of the torch nn.Sequential object). The encoder encodes the context of the missing region into 4000 features (the output of the third module).
Let us test the pre-loaded networks on several images.
--First, let us load some images
-- Build a batch of 3 RGB images at the network's 128x128 input resolution.
inputSize = 128
image_ctx = torch.Tensor(3, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(3, 3, inputSize, inputSize)
--Loading an image from the paris street view dataset
local input = image.load('./context-encoder/images/paris/021_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
-- map pixel values from [0,1] to [-1,1], the range the networks were trained on
input:mul(2):add(-1)
image_ctx[1]:copy(input)
--Loading an image from the imagenet dataset
local input = image.load('./context-encoder/images/imagenet/020_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[2]:copy(input)
--Loading an image from ucberkeley dataset
local input = image.load('./context-encoder/images/ucberkeley/004_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[3]:copy(input)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
-- upscale the strip for a larger on-screen display
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
--Let us remove a center region from each input image
real_center = image_ctx[{{},{},{1 + inputSize/4, inputSize/2 + inputSize/4},{1 + inputSize/4, inputSize/2 + inputSize/4}}]:clone() -- copy by value
-- fill center region with mean value
-- The +4/-4 offsets leave a 4-pixel band of real pixels inside the 64x64
-- center (the "overlap" region used by the authors' demo).
-- The fill constants 117/104/123 are per-channel mean pixel values mapped
-- to [-1,1] -- presumably the ImageNet means used by the authors; confirm.
image_ctx[{{},{1},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*117.0/255.0 - 1.0
image_ctx[{{},{2},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*104.0/255.0 - 1.0
image_ctx[{{},{3},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*123.0/255.0 - 1.0
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
-- run the context-encoder trained on the paris street view data set to inpaint center
pred_center_paris = netParis:forward(input_image_ctx)
print('Prediction: size: ', pred_center_paris:size(1)..' x '..pred_center_paris:size(2) ..' x '..pred_center_paris:size(3)..' x '..pred_center_paris:size(4))
-- run the context-encoder trained on the imagenet data set to inpaint center
pred_center_imagenet = netImagenet:forward(input_image_ctx)
print('Prediction: size: ', pred_center_imagenet:size(1)..' x '..pred_center_imagenet:size(2) ..' x '..pred_center_imagenet:size(3)..' x '..pred_center_imagenet:size(4))
Note that the context encoder takes as input the whole image (with the missing center region) and returns an output of the size of the center region: 3 x 64 x 64 (and not of the same size as the input, as one could expect from an auto-encoder).
-- Prediction from paris street view context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context (skipping the 4-pixel overlap band)
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_paris[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal ([-1,1] -> [0,1])
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
-- Prediction from imagenet context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_imagenet[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
Note that each context-encoder performs slightly better on images of the same kind as the ones in the data set it was trained on. Hence, the first context-encoder (first row) performs better on paris street view images (first image) than the second context-encoder (second row), which itself performs better on images from the imagenet dataset (second image).
--Load the image in the same way as before
--This time we will fill the masked region with a 0 value instead of a mean value.
--First, let us load some images
inputSize = 128
image_ctx = torch.Tensor(3, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(3, 3, inputSize, inputSize)
--Loading an image from the paris street view dataset
local input = image.load('./context-encoder/images/paris/021_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
-- map pixel values from [0,1] to [-1,1]
input:mul(2):add(-1)
image_ctx[1]:copy(input)
--Loading an image from the imagenet dataset
local input = image.load('./context-encoder/images/imagenet/020_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[2]:copy(input)
--Loading an image from ucberkeley dataset
local input = image.load('./context-encoder/images/ucberkeley/004_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[3]:copy(input)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
--Let us remove a center region from each input image
real_center = image_ctx[{{},{},{1 + inputSize/4, inputSize/2 + inputSize/4},{1 + inputSize/4, inputSize/2 + inputSize/4}}]:clone() -- copy by value
-- fill center region with mean value
---this time with 0 values (0 in [-1,1] space, i.e. mid-gray)
image_ctx[{{},{1},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 0--2*117.0/255.0 - 1.0
image_ctx[{{},{2},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 0--2*104.0/255.0 - 1.0
image_ctx[{{},{3},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 0--2*123.0/255.0 - 1.0
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
-- run the context-encoder trained on the paris street view data set to inpaint center
pred_center_paris = netParis:forward(input_image_ctx)
print('Prediction: size: ', pred_center_paris:size(1)..' x '..pred_center_paris:size(2) ..' x '..pred_center_paris:size(3)..' x '..pred_center_paris:size(4))
-- run the context-encoder trained on the imagenet data set to inpaint center
pred_center_imagenet = netImagenet:forward(input_image_ctx)
print('Prediction: size: ', pred_center_imagenet:size(1)..' x '..pred_center_imagenet:size(2) ..' x '..pred_center_imagenet:size(3)..' x '..pred_center_imagenet:size(4))
-- Prediction from paris street view context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_paris[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
-- Prediction from imagenet context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_imagenet[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 3*inputSize)
for i=1,3 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 3*256, 256)
itorch.image(viz)
We note that changing the color of the masked region has an impact on the context-encoder outputs (hence the latter is not independent of the content of the masked region).
It seems that the model performs quite well on simple structures, such as straight lines and regular curves. Let us test our two context-encoders on binary images with this kind of shapes.
--First, let us load some images
-- Build a batch of 7 synthetic binary images to probe the networks on
-- simple geometric structures.
inputSize = 128
image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
-- image 1: white square with a 1-pixel black border
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[1] = img
itorch.image(img)
-- image 2: white left half, black right half (vertical edge)
img = torch.Tensor(3, inputSize, inputSize):zero() --the context encoder needs three channels as input
img[{{}, {}, {1,inputSize/2}}] = torch.ones(3, inputSize, inputSize/2)
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[2] = img
itorch.image(img)
-- image 3: horizontal 3-pixel black line across the middle
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
image_ctx[3] = img
itorch.image(img)
-- image 4: right-angle corner made of two 3-pixel black segments
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {inputSize/2+1, inputSize}}] = torch.zeros(3, 3, inputSize/2)
img[{{}, {inputSize/2-1, inputSize}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize/2+2, 3)
image_ctx[4] = img
itorch.image(img)
-- image 5: full black cross (horizontal + vertical 3-pixel lines)
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
img[{{}, {}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize, 3)
image_ctx[5] = img
itorch.image(img)
-- image 6: four quadrants with different colors (one channel lit per quadrant,
-- white top-left)
img = torch.Tensor(3, inputSize, inputSize):zeros(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{1}, {inputSize/2+1, inputSize}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{2}, {inputSize/2+1, inputSize}, {1, inputSize/2}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{3}, {1, inputSize/2}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{}, {1, inputSize/2}, {1, inputSize/2}}] = torch.ones(3, inputSize/2, inputSize/2)
image_ctx[6] = img
itorch.image(img)
-- image 7: black disk centered at (3/4*inputSize, inputSize/2);
-- R is the squared radius (2048 -> radius ~45 px)
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
R = 2048
xc = 3*inputSize/4
yc = inputSize/2
for x=1,inputSize do
for y=1,inputSize do
tmp = (x-xc)*(x-xc) + (y-yc)*(y-yc)
if tmp<R then img[{{}, {x}, {y}}] = 0 end
end
end
image_ctx[7] = img
itorch.image(img)
-- map pixel values from [0,1] to [-1,1] and mask the centers with the mean color
image_ctx:mul(2):add(-1)
image_ctx[{{},{1},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*117.0/255.0 - 1.0
image_ctx[{{},{2},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*104.0/255.0 - 1.0
image_ctx[{{},{3},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*123.0/255.0 - 1.0
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
itorch.image(viz)
-- run the context-encoder trained on the paris street view data set to inpaint center
pred_center_paris = netParis:forward(input_image_ctx)
-- run the context-encoder trained on the imagenet data set to inpaint center
pred_center_imagenet = netImagenet:forward(input_image_ctx)
-- Prediction from paris street view context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_paris[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
itorch.image(viz)
-- Prediction from imagenet context-encoder
image_ctx = input_image_ctx:clone()
-- paste predicted center in the context
image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center_imagenet[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
-- re-transform scale back to normal
image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
itorch.image(viz)
We note that the network trained on imagenet images performs better on these simple shapes.
--First, let us load some images
-- Build a 4-image batch (2 paris street view, 2 imagenet) used by the
-- layer-randomization experiment below.
inputSize = 128
image_ctx = torch.Tensor(4, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(4, 3, inputSize, inputSize)
--Loading an image from the paris street view dataset
local input = image.load('./context-encoder/images/paris/021_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[1]:copy(input)
--Loading an image from the paris street view dataset
local input = image.load('./context-encoder/images/paris/005_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[2]:copy(input)
--Loading an image from the imagenet dataset
local input = image.load('./context-encoder/images/imagenet/020_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[3]:copy(input)
--Loading an image from the imagenet dataset
local input = image.load('./context-encoder/images/imagenet/005_im.png', 3, 'float')
input = image.scale(input, inputSize, inputSize)
input:mul(2):add(-1)
image_ctx[4]:copy(input)
--image_ctx:add(1):mul(0.5)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 4*inputSize)
for i=1,4 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 4*256, 256)
itorch.image(viz)
--Let us remove a center region from each input image
local real_center = image_ctx[{{},{},{1 + inputSize/4, inputSize/2 + inputSize/4},{1 + inputSize/4, inputSize/2 + inputSize/4}}]:clone() -- copy by value
-- fill center region with mean value
image_ctx[{{},{1},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*117.0/255.0 - 1.0
image_ctx[{{},{2},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*104.0/255.0 - 1.0
image_ctx[{{},{3},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*123.0/255.0 - 1.0
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 4*inputSize)
for i=1,4 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 4*256, 256)
itorch.image(viz)
Now let us try to analyse the importance of each layer of the context-encoder, by randomly modifying its weights. But first, let us have a deeper look into the context-encoder architecture:
-- Load a pre-trained inpainting network from `path` and prepare it for
-- inference: allocate zeroed gradient buffers for every parameterized
-- module (the serialized nets ship without them), switch to evaluate
-- mode and convert the parameters to float. Returns the network.
function loadNet(path)
  local net = torch.load(path)  -- local: avoid clobbering the script-level `net`
  net:apply(function(m)
    if m.weight then
      m.gradWeight = m.weight:clone():zero()
      m.gradBias = m.bias:clone():zero()
    end
  end)
  net:evaluate()
  net:float()
  return net
end
-- Run `net` on the masked batch `input_image_ctx` (N x 3 x inputSize x
-- inputSize, values in [-1,1]), paste the predicted centers back into the
-- masked regions (skipping the 4-pixel overlap band), rescale to [0,1]
-- and display the whole batch as one horizontal strip.
-- Reads the global `inputSize`; displays via itorch. Returns nothing.
function inpainting(net, input_image_ctx)
  -- all temporaries are local so the caller's globals are not clobbered
  local pred_center = net:forward(input_image_ctx)
  local nIm = input_image_ctx:size(1)
  local image_ctx = input_image_ctx:clone()
  -- paste predicted center in the context
  image_ctx[{{},{},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}]:copy(pred_center[{{},{},{1 + 4, inputSize/2 - 4},{1 + 4, inputSize/2 - 4}}])
  -- re-transform scale back to normal ([-1,1] -> [0,1])
  image_ctx:add(1):mul(0.5)
  -- vizualizing images one next to the other
  local viz = torch.Tensor(3, inputSize, nIm*inputSize)
  for i=1,nIm do
    viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
  end
  viz = image.scale(viz, nIm*256, 256)
  itorch.image(viz)
end
-- sanity check of the helpers: reload the paris net and inpaint the 4-image batch
net = loadNet('./context-encoder/models/inpaintCenter/paris_inpaintCenter.t7')
inpainting(net, input_image_ctx)
--randomize the weights of the layer m
-- If `m` is a convolutional module, resample each weight slice along
-- dimension 2 from a normal distribution matching that slice's own
-- mean/std (so the overall weight statistics are preserved).
-- NOTE(review): for a SpatialConvolution weight tensor, dim 2 is the
-- number of *input* planes, not output kernels -- presumably intentional
-- (per-input-channel resampling); confirm.
function random_weights(m)
  local name = torch.type(m)
  if name:find('Convolution') then
    local nb_kernel = m.weight:size(2)
    for i=1,nb_kernel do
      local mean = m.weight[{{}, {i}, {}, {}}]:mean()
      local std = m.weight[{{}, {i}, {}, {}}]:std()
      m.weight[{{}, {i}, {}, {}}]:normal(mean, std)
    end
  end
end
--print the results obtained when randomizing the weights of different convolutional layers
-- (the other parameters being pre-trained)
-- assume that the network cannot have sequence in a module of a sequence in a module of a sequence
-- (i.e. at most one level of nested nn.Sequential is handled)
-- For each convolutional layer found, randomize that layer alone, show the
-- inpainting result, then reload the pristine network from `path` before
-- perturbing the next layer.
function randomizeWeights(path, input_image_ctx)
  -- all temporaries are local so the caller's globals are not clobbered
  local net = loadNet(path)
  local L = net:size()
  for i=1,L do
    local m = net:get(i)
    if torch.type(m):find('Sequential') then
      -- one level of nesting: walk the sub-sequence
      local subL = m:size()
      for j=1,subL do
        local n = m:get(j)
        if torch.type(n):find('Convolution') then
          random_weights(n)
          inpainting(net, input_image_ctx)
          -- restore pre-trained weights before the next perturbation
          net = loadNet(path)
          m = net:get(i)
        end
      end
    else
      if torch.type(m):find('Convolution') then
        random_weights(m)
        inpainting(net, input_image_ctx)
        net = loadNet(path)
      end
    end
  end
end
-- run the per-layer randomization study on both pre-trained networks
randomizeWeights('./context-encoder/models/inpaintCenter/paris_inpaintCenter.t7', input_image_ctx)
randomizeWeights('./context-encoder/models/inpaintCenter/imagenet_inpaintCenter.t7', input_image_ctx)
Let us try to add another non-masked region, of the same color as the mask.
--First, create again binary images
-- Same 7 synthetic shapes as before; this time an extra *non-masked*
-- rectangle is painted with the mask's mean color to test whether the
-- networks are distracted by mask-colored regions outside the center.
inputSize = 128
image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
-- image 1: white square with a 1-pixel black border
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[1]:copy(img)
-- image 2: white left half, black right half
img = torch.Tensor(3, inputSize, inputSize):zero() --the context encoder needs three channels as input
img[{{}, {}, {1,inputSize/2}}] = torch.ones(3, inputSize, inputSize/2)
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[2]:copy(img)
-- image 3: horizontal black line
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
image_ctx[3]:copy(img)
-- image 4: right-angle corner
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {inputSize/2+1, inputSize}}] = torch.zeros(3, 3, inputSize/2)
img[{{}, {inputSize/2-1, inputSize}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize/2+2, 3)
image_ctx[4]:copy(img)
-- image 5: black cross
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
img[{{}, {}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize, 3)
image_ctx[5]:copy(img)
-- image 6: four colored quadrants
img = torch.Tensor(3, inputSize, inputSize):zeros(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{1}, {inputSize/2+1, inputSize}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{2}, {inputSize/2+1, inputSize}, {1, inputSize/2}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{3}, {1, inputSize/2}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{}, {1, inputSize/2}, {1, inputSize/2}}] = torch.ones(3, inputSize/2, inputSize/2)
image_ctx[6]:copy(img)
-- image 7: black disk (R is the squared radius)
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
R = 2048
xc = 3*inputSize/4
yc = inputSize/2
for x=1,inputSize do
for y=1,inputSize do
tmp = (x-xc)*(x-xc) + (y-yc)*(y-yc)
if tmp<R then img[{{}, {x}, {y}}] = 0 end
end
end
image_ctx[7]:copy(img)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 7*256, 256)
itorch.image(viz)
image_ctx:mul(2):add(-1)
-- fill a non masked region with mean value (upper band, same color as the mask)
image_ctx[{{},{1},{10 , inputSize/4},{inputSize/4, inputSize/2 + inputSize/4}}] = 2*117.0/255.0 - 1.0
image_ctx[{{},{2},{10 , inputSize/4},{inputSize/4, inputSize/2 + inputSize/4}}] = 2*104.0/255.0 - 1.0
image_ctx[{{},{3},{10 , inputSize/4},{inputSize/4, inputSize/2 + inputSize/4}}] = 2*123.0/255.0 - 1.0
--Let us add a non masked region of the same color as the mask.
image_ctx[{{},{1},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*117.0/255.0 - 1.0
image_ctx[{{},{2},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*104.0/255.0 - 1.0
image_ctx[{{},{3},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4},{1 + inputSize/4 + 4, inputSize/2 + inputSize/4 - 4}}] = 2*123.0/255.0 - 1.0
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 7*256, 256)
itorch.image(viz)
-- inpaint with both pre-trained networks
net = loadNet('./context-encoder/models/inpaintCenter/paris_inpaintCenter.t7')
inpainting(net, input_image_ctx)
net = loadNet('./context-encoder/models/inpaintCenter/imagenet_inpaintCenter.t7')
inpainting(net, input_image_ctx)
We note that the modified region did not have a big impact on the result. However, when comparing with the results we obtained earlier on the non altered images, we notice that the reconstruction has been slightly influenced (especially for the angle and the cross).
Finally, let us try to feed the whole unmasked image as input to the model.
--First, let us load some images
-- Same 7 synthetic shapes, but this time fed to the network WITHOUT any
-- masked region, to see what it predicts for an unmasked center.
inputSize = 128
image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
input_image_ctx = torch.Tensor(7, 3, inputSize, inputSize)
-- image 1: white square with a 1-pixel black border
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[1]:copy(img)
-- image 2: white left half, black right half
img = torch.Tensor(3, inputSize, inputSize):zero() --the context encoder needs three channels as input
img[{{}, {}, {1,inputSize/2}}] = torch.ones(3, inputSize, inputSize/2)
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
image_ctx[2]:copy(img)
-- image 3: horizontal black line
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
image_ctx[3]:copy(img)
-- image 4: right-angle corner
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {inputSize/2+1, inputSize}}] = torch.zeros(3, 3, inputSize/2)
img[{{}, {inputSize/2-1, inputSize}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize/2+2, 3)
image_ctx[4]:copy(img)
-- image 5: black cross
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize/2-1, inputSize/2 +1}, {}}] = torch.zeros(3, 3, inputSize)
img[{{}, {}, {inputSize/2-1, inputSize/2 +1}}] = torch.zeros(3, inputSize, 3)
image_ctx[5]:copy(img)
-- image 6: four colored quadrants
img = torch.Tensor(3, inputSize, inputSize):zeros(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
img[{{1}, {inputSize/2+1, inputSize}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{2}, {inputSize/2+1, inputSize}, {1, inputSize/2}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{3}, {1, inputSize/2}, {inputSize/2+1, inputSize}}] = torch.ones(1, inputSize/2, inputSize/2)
img[{{}, {1, inputSize/2}, {1, inputSize/2}}] = torch.ones(3, inputSize/2, inputSize/2)
image_ctx[6]:copy(img)
-- image 7: black disk (R is the squared radius)
img = torch.Tensor(3, inputSize, inputSize):ones(3, inputSize, inputSize) --the context encoder needs three channels as input
img[{{}, {1}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {1}}] = torch.zeros(3, inputSize)
img[{{}, {inputSize}, {}}] = torch.zeros(3, inputSize)
img[{{}, {}, {inputSize}}] = torch.zeros(3, inputSize)
R = 2048
xc = 3*inputSize/4
yc = inputSize/2
for x=1,inputSize do
for y=1,inputSize do
tmp = (x-xc)*(x-xc) + (y-yc)*(y-yc)
if tmp<R then img[{{}, {x}, {y}}] = 0 end
end
end
image_ctx[7]:copy(img)
-- map to [-1,1]; note: no mask is applied this time
image_ctx:mul(2):add(-1)
input_image_ctx:copy(image_ctx)
-- vizualizing images one next to the other
viz = torch.Tensor(3, inputSize, 7*inputSize)
for i=1,7 do
viz[{{},{},{(i-1)*inputSize+1, i*inputSize}}]:copy(image_ctx[i])
end
viz = image.scale(viz, 7*256, 256)
itorch.image(viz)
-- `net` is whichever network was loaded last (the imagenet one in this script)
inpainting(net, input_image_ctx)
These results may make us wonder about the ability of the context encoder to perform inpainting with masks of random shapes, positions and sizes. Indeed, the latter seem to have quite an influence on the results of the model.